Here, we want to recreate Figure 1A of Lax et al 2017.
In addition to looking at the floor samples like they did, we will look at each sample type separately. In Figure 1A, they use room floor and station floor samples.
library(phyloseq); packageVersion("phyloseq")
## [1] '1.44.0'
# library(Biostrings); packageVersion("Biostrings")
library(ggplot2); packageVersion("ggplot2")
## [1] '3.4.2'
library(dplyr); packageVersion("dplyr")
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## [1] '1.1.2'
library(ggrepel); packageVersion("ggrepel")
## [1] '0.9.3'
DADA2_part2 = dir("../..", pattern="DADA2_part2", full.names = T, include.dirs = T)
countsFile = file.path(DADA2_part2, "output", "asv-counts.txt")
counts = read.delim2(countsFile, row.names = 1)
dim(counts)
## [1] 3008 952
counts = counts[rowSums(counts) > 0,]
dim(counts)
## [1] 2915 952
head(row.names(counts))
## [1] "ERR1459793" "ERR1459883" "ERR1460576" "ERR1460577" "ERR1460578"
## [6] "ERR1460579"
head(colnames(counts))
## [1] "ASV.1" "ASV.2" "ASV.3" "ASV.4" "ASV.5" "ASV.6"
metaReduced = dir("../..", pattern="metadata_PRJEB14474", full.names = T, include.dirs = T)
metaFile = file.path(metaReduced, "output", "PRJEB14474_SraRunTable_reduced.txt")
meta = read.delim2(metaFile)
row.names(meta) = meta$ID
message("Read metadata from: ", metaFile)
## Read metadata from: ../../02_review_metadata_PRJEB14474/output/PRJEB14474_SraRunTable_reduced.txt
We will need to describe days as being “pre-opening” or “post-opening”. This information is held is “study_day”. Make a new variable that is a simple category instead of a numeric.
meta$opening = "before opening"
meta$opening[meta$study_day > 0] = "after opening"
We should have meta data for all samples we had data for…and for some samples that have been filtered out. Limit the meta data to only include samples we have in the data.
inData = row.names(counts)
meta = meta[inData, ]
dim(meta)
## [1] 2915 14
assignTax = dir("../..", pattern="assign_ASV_taxonomy", full.names = T, include.dirs = T)
taxaFile = file.path(assignTax, "output/silva/asvTaxaTable_silva-v138.txt")
taxa = read.delim2(taxaFile, row.names = "asvID")
# drop the ASV column
taxaLim = taxa[,grep("ASV", names(taxa), invert = T, value = T)]
taxaMat = as.matrix(taxaLim)
# tmpDir = "../temp"
# suppressWarnings(dir.create(tmpDir, recursive = T))
#
outDir = "../output"
suppressWarnings(dir.create(outDir, recursive = T))
In the legend, Figure 1A is described as:
- Principal coordinate analysis (PCoA) of all floor samples based on weighted UniFrac distance and colored by whether they were taken before or after the hospital’s opening.
# alt syntax:
# doPCoA = function(counts=counts, meta=meta, taxaMat=taxaMat, sampleType="Floor"){
doPCoA = function(sampleType="Floor"){
sampleSet = meta[meta$SAMPLE_TYPE %in% sampleType, "ID"]
countslim = counts[sampleSet,]
ps <- phyloseq(otu_table(counts[sampleSet,], taxa_are_rows=FALSE),
sample_data(meta[sampleSet,]),
tax_table(taxaMat))
ps.prop <- transform_sample_counts(ps, function(otu) otu/sum(otu))
ord <- ordinate(ps.prop, method="PCoA", distance="bray") # TODO: switch to unifrac to match paper
po = plot_ordination(ps.prop, ord, shape="surcat", color="opening", title=paste0("bray PCoA - ", paste0(sampleType, collapse=", ")))
return(list(ord=ord, plotA=po))
}
floor = doPCoA(sampleType="Floor")
floor$plotA
ggsave(file.path(outDir, "floor_bray_PCoA.png"))
## Saving 7 x 5 in image
Do the same for each sample type.
sampleTypes = unique(meta$SAMPLE_TYPE)
plotList = as.list(sampleTypes)
names(plotList) = sampleTypes
for (type in sampleTypes){
aplot = doPCoA(sampleType=type)
plotList[[type]] = aplot
show(aplot$plotA)
}
Save images to a pdf.
pdf(file.path(outDir, "allTypes_bray_PCoA.pdf"))
for (type in sampleTypes){
show(plotList[[type]]$plotA)
}
dev.off()
## quartz_off_screen
## 2
a = doPCoA(sampleType=c("Floor", "Corridor Floor"))
show(a$plotA)
ggsave(file.path(outDir, "floors_bray_PCoA.png"), a$plotA)
## Saving 7 x 5 in image
b = doPCoA(sampleType=c("Countertop", "Computer Mouse", "Station Phone"))
show(b$plotA)
ggsave(file.path(outDir, "fig1-B.png"), b$plotA)
## Saving 7 x 5 in image
Now I am curious. Corridor Floor samples separate nicely along axis 1 between pre- and post-opening. What do I see if I plot the axis 1 value against the study day? Do I see leap from one stable state to another? Do I see a gradual change over time?
do.plot2 = function(type, axis="Axis.1"){
y = plotList[[type]]$ord$vectors[,axis]
df = data.frame(pcoa = y,
study_day = meta[names(y),"study_day"],
opening = meta[names(y),"opening"])
dfPost = df[df$study_day > 0, ]
ct = cor.test(x = dfPost$study_day, y = dfPost$pcoa)
plot2 = ggplot(df, aes(x=study_day, y=pcoa, col=opening)) +
geom_point() +
geom_vline(xintercept = 0,
linetype = 1,
col="grey") +
ylab(axis) +
ggtitle(type)
if (ct$p.value < 0.05) {
annot = paste("post-opening cor: ", round(ct$estimate, 2), "; pval:", round(ct$p.value, 4))
plot2 = plot2 +
ggtitle(type, subtitle = annot)
}
show(plot2)
return(plot2)
}
L2 = list(
do.plot2("Corridor Floor"),
do.plot2("Floor", "Axis.1"),
do.plot2("Floor", "Axis.2"),
do.plot2("Cold Tap Water"),
do.plot2("Countertop"),
do.plot2("Computer Mouse"),
do.plot2("Station Phone"),
do.plot2("blank control")
)
pdf(file.path(outDir, "study-day.v.pcoAx.pdf"))
for (p2 in L2){
show(p2)
}
dev.off()
## quartz_off_screen
## 2
Do each sapmle type alongside the blank controls.
doPCoA2 = function(sampleType="Floor"){
sampleSet = meta[meta$SAMPLE_TYPE %in% sampleType, "ID"]
countslim = counts[sampleSet,]
ps <- phyloseq(otu_table(counts[sampleSet,], taxa_are_rows=FALSE),
sample_data(meta[sampleSet,]),
tax_table(taxaMat))
ps.prop <- transform_sample_counts(ps, function(otu) otu/sum(otu))
ord <- ordinate(ps.prop, method="PCoA", distance="bray") # TODO: switch to unifrac to match paper
po = plot_ordination(ps.prop, ord, shape="surcat", color="SAMPLE_TYPE", title=paste0("bray PCoA - ", paste0(sampleType, collapse=", ")))
return(list(ord=ord, plotB=po))
}
eachVsBlank = list()
for (type in sampleTypes){
eachVsBlank[[type]] = doPCoA2(sampleType=c("blank control", type))
show(eachVsBlank[[type]]$plotB)
}
pdf(file.path(outDir, "eachType_vBlank_PCoA.pdf"))
for (type in sampleTypes){
show(eachVsBlank[[type]]$plotA)
}
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
dev.off()
## quartz_off_screen
## 2
eachVsGlove = list()
for (type in sampleTypes){
eachVsGlove[[type]] = doPCoA2(sampleType=c("Glove from Glove Box", type))
show(eachVsGlove[[type]]$plotB)
}
pdf(file.path(outDir, "eachType_vGlove_PCoA.pdf"))
for (type in sampleTypes){
show(eachVsGlove[[type]]$plotA)
}
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
dev.off()
## quartz_off_screen
## 2
eachVsBlankGlove = list()
for (type in sampleTypes){
eachVsBlankGlove[[type]] = doPCoA2(sampleType=c("blank control", "Glove from Glove Box", type))
show(eachVsBlankGlove[[type]]$plotB)
}
sessionInfo()
## R version 4.3.0 (2023-04-21)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Ventura 13.4.1
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ggrepel_0.9.3 dplyr_1.1.2 ggplot2_3.4.2 phyloseq_1.44.0
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.3 xfun_0.39 bslib_0.4.2
## [4] rhdf5_2.44.0 Biobase_2.60.0 lattice_0.21-8
## [7] rhdf5filters_1.12.1 vctrs_0.6.2 tools_4.3.0
## [10] bitops_1.0-7 generics_0.1.3 biomformat_1.28.0
## [13] stats4_4.3.0 parallel_4.3.0 tibble_3.2.1
## [16] fansi_1.0.4 highr_0.10 cluster_2.1.4
## [19] pkgconfig_2.0.3 Matrix_1.5-4 data.table_1.14.8
## [22] S4Vectors_0.38.1 lifecycle_1.0.3 GenomeInfoDbData_1.2.10
## [25] farver_2.1.1 compiler_4.3.0 stringr_1.5.0
## [28] textshaping_0.3.6 Biostrings_2.68.1 munsell_0.5.0
## [31] codetools_0.2-19 permute_0.9-7 GenomeInfoDb_1.36.0
## [34] htmltools_0.5.5 sass_0.4.6 RCurl_1.98-1.12
## [37] yaml_2.3.7 pillar_1.9.0 crayon_1.5.2
## [40] jquerylib_0.1.4 MASS_7.3-58.4 cachem_1.0.8
## [43] vegan_2.6-4 iterators_1.0.14 foreach_1.5.2
## [46] nlme_3.1-162 tidyselect_1.2.0 digest_0.6.31
## [49] stringi_1.7.12 reshape2_1.4.4 labeling_0.4.2
## [52] splines_4.3.0 ade4_1.7-22 fastmap_1.1.1
## [55] grid_4.3.0 colorspace_2.1-0 cli_3.6.1
## [58] magrittr_2.0.3 survival_3.5-5 utf8_1.2.3
## [61] ape_5.7-1 withr_2.5.0 scales_1.2.1
## [64] rmarkdown_2.21 XVector_0.40.0 multtest_2.56.0
## [67] igraph_1.4.3 ragg_1.2.5 evaluate_0.21
## [70] knitr_1.42 IRanges_2.34.0 mgcv_1.8-42
## [73] rlang_1.1.1 Rcpp_1.0.10 glue_1.6.2
## [76] BiocGenerics_0.46.0 rstudioapi_0.14 jsonlite_1.8.4
## [79] R6_2.5.1 Rhdf5lib_1.22.0 plyr_1.8.8
## [82] systemfonts_1.0.4 zlibbioc_1.46.0